#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter("ignore")
# Import the data file and take a first look.
vehicle = pd.read_csv("vehicle-1.csv")
vehicle.head()
# Data type of variables in data
vehicle.info()
vehicle.shape
# There are many columns which contain missing values.
vehicle.isna().sum()
vehicle.describe()
# Skewness summary:
# compactness, circularity, distance_circularity, radius_ratio, elongatedness,
# max.length_rectangularity, scaled_radius_of_gyration, skewness_about.2 and
# hollows_ratio are approximately symmetric.
# scatter_ratio, pr.axis_rectangularity, scaled_variance, scaled_variance.1,
# skewness_about and skewness_about.1 are moderately right skewed.
# pr.axis_aspect_ratio, max.length_aspect_ratio and
# scaled_radius_of_gyration.1 are highly right skewed.
# numeric_only=True keeps the categorical 'class' column out of the
# computation; on pandas >= 2.0 skew() raises a TypeError otherwise.
vehicle.skew(numeric_only=True)
# Univariate KDE plots for every numeric feature.
# Most features show multimodal distributions; pr.axis_aspect_ratio,
# max.length_aspect_ratio, scaled_variance, scaled_variance.1,
# scaled_radius_of_gyration.1, skewness_about and skewness_about.1 are
# visibly skewed as well.
kde_features = [
    "compactness",
    "circularity",
    "distance_circularity",
    "radius_ratio",
    "pr.axis_aspect_ratio",
    "max.length_aspect_ratio",
    "scatter_ratio",
    "elongatedness",
    "pr.axis_rectangularity",
    "max.length_rectangularity",
    "scaled_variance",
    "scaled_variance.1",
    "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1",
    "skewness_about",
    "skewness_about.1",
    "skewness_about.2",
    "hollows_ratio",
]
for feature in kde_features:
    sns.kdeplot(vehicle[feature])
# Features in most cases have multimodal distributions.
# Boxplots for every feature, to inspect outliers.
boxplot_features = [
    "compactness",
    "circularity",
    "distance_circularity",
    "radius_ratio",
    "pr.axis_aspect_ratio",
    "max.length_aspect_ratio",
    "scatter_ratio",
    "elongatedness",
    "pr.axis_rectangularity",
    "max.length_rectangularity",
    "scaled_variance",
    "scaled_variance.1",
    "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1",
    "skewness_about",
    "skewness_about.1",
    "skewness_about.2",
    "hollows_ratio",
]
for feature in boxplot_features:
    sns.boxplot(vehicle[feature])
# Most of the feature distributions seem fine; a few (radius_ratio,
# pr.axis_aspect_ratio, max.length_aspect_ratio,
# scaled_radius_of_gyration.1) contain outliers.
# Class distribution: car is the most frequent class, followed by bus and van.
vehicle["class"].value_counts()

# Features grouped by class have clearly different mean values...
vehicle.groupby(["class"]).agg('mean')
# ...and different median values as well.
vehicle.groupby(["class"]).agg('median')
## Mean and median are separable for the 3 given vehicle classes.
## The median will be used to fill blanks, since the data is skewed.

# Per-class boxplot for each feature.
profile_features = [
    "compactness",
    "circularity",
    "distance_circularity",
    "radius_ratio",
    "pr.axis_aspect_ratio",
    "max.length_aspect_ratio",
    "scatter_ratio",
    "elongatedness",
    "pr.axis_rectangularity",
    "max.length_rectangularity",
    "scaled_variance",
    "scaled_variance.1",
    "scaled_radius_of_gyration",
    "scaled_radius_of_gyration.1",
    "skewness_about",
    "skewness_about.1",
    "skewness_about.2",
    "hollows_ratio",
]
for feature in profile_features:
    sns.boxplot(x='class', y=feature, data=vehicle)
# As stated above, features have different value distributions per class, so
# the classes are fairly separable. However, we cannot distinguish between
# the 2 car models.
## Fill blanks in each feature column with the per-class median
## (median preferred over mean because the data is skewed).
vehicle_col=['compactness', 'circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio',
'max.length_aspect_ratio', 'scatter_ratio','elongatedness', 'pr.axis_rectangularity',
'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'hollows_ratio']
for item in vehicle_col:
    # Assign the filled column back instead of Series.fillna(..., inplace=True):
    # in-place fillna on a column selection is deprecated chained assignment
    # and silently stops updating the frame under pandas copy-on-write.
    vehicle[item] = vehicle[item].fillna(
        vehicle.groupby('class')[item].transform('median'))
vehicle.info()
# Now there are no blanks in the data.
vehicle.isna().sum()
# Many features have a high correlation value with each other.
# numeric_only=True excludes the categorical 'class' column, which would
# raise a TypeError in DataFrame.corr() on pandas >= 2.0.
corr=vehicle.corr(numeric_only=True)
plt.subplots(figsize =(15, 10))
sns.heatmap(corr,cmap="YlGnBu",annot=True)
# Many features being correlated with each other is visible from the pairplot.
sns.pairplot(vehicle)
sns.pairplot(vehicle,hue='class')
# There are a few variables which exhibit a weak relationship.
# No variable clearly separates the 3 groups -- the data overlaps. However the
# peaks, or parts of some distributions, differ; so these features should
# still be informative for the algorithm.
# Many variables are highly correlated, thus PCA can be used.
vehicle.groupby('class').quantile([.25, .5, .75, .90, .95, .99,1])
## Nothing looks like an obviously unusual value, so for now the data is kept
## as-is rather than removing any outliers.
# Separate features (X) and target (y).
# Use the columns keyword: the positional-axis form drop(['class'], 1)
# was deprecated and removed in pandas 2.0.
X_vehicle = vehicle.drop(columns=['class'])
y_vehicle = vehicle['class']
print(X_vehicle.shape)
print(y_vehicle.shape)
#### Preprocessing of data for standardization (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset BEFORE the
# train/test split, which leaks test-set statistics into training.
# Acceptable for this exploratory analysis; fit on the train split only
# in production.
from sklearn.preprocessing import scale
X_std = scale(X_vehicle)
X_std = pd.DataFrame(X_std, columns=X_vehicle.columns)
X_std.head()
from sklearn.model_selection import train_test_split
X_train_std, X_test, y_train, y_test = train_test_split(
    X_std, y_vehicle, test_size=0.2, random_state=100)
from sklearn.decomposition import PCA

pca = PCA(svd_solver='randomized', random_state=10)
# Fit PCA on the standardized training dataset.
pca.fit(X_train_std)
pca.components_
pca.explained_variance_
pca.explained_variance_ratio_

# Loadings table: one column per principal component, plus the feature names.
colnames = list(X_vehicle.columns)
loadings = {'PC' + str(i + 1): pca.components_[i] for i in range(18)}
loadings['Feature'] = colnames
pcs_vehicle_df = pd.DataFrame(loadings)
pcs_vehicle_df.head()
# Scree plot: cumulative explained variance against number of components.
# (The duplicate %matplotlib inline magic was removed -- it is already set
# at the top of the notebook and is not valid Python in a script.)
fig = plt.figure(figsize=(10, 6))
plt.plot(range(1, 19), np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

# Individual and cumulative explained variance per principal component.
fig = plt.figure(figsize=(10, 6))
plt.bar(range(1, 19), pca.explained_variance_ratio_, alpha=0.5,
        align='center', label='individual explained variance')
plt.step(range(1, 19), np.cumsum(pca.explained_variance_ratio_),
         where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
np.cumsum(pca.explained_variance_ratio_)
## 7 components explain almost 96% of the variation in the data.
# Reduce to 7 principal components (~96% of the variance).
pca_final = PCA(n_components=7)
X_pca = pca_final.fit_transform(X_train_std)
pc7_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7']
X_train_pca = pd.DataFrame(X_pca, columns=pc7_cols)
type(X_train_pca)
X_train_pca.shape
# Apply the same train-fitted projection to the test data.
X_test_pca = pca_final.transform(X_test)
X_test_pca = pd.DataFrame(X_test_pca, columns=pc7_cols)
type(X_test_pca)
X_test_pca.shape
## There is very low correlation among the PCA components.
# Draw each heatmap on its own figure: plotting both onto one figure (as
# before) overdraws the train heatmap with the test one, and printing the
# returned Axes object adds nothing.
plt.figure(figsize=(10, 10))
sns.heatmap(X_train_pca.corr(), annot=True)
plt.figure(figsize=(10, 10))
sns.heatmap(X_test_pca.corr(), annot=True)
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Baseline model: linear-kernel Support Vector Classifier on the full
# standardized feature set (no PCA).
svc_1 = SVC(C=1, kernel='linear', random_state=100)
svc_1.fit(X_train_std, y_train)

# Predict on the held-out test split.
y_pred_test = svc_1.predict(X_test)

# Confusion matrix and accuracy.
confusion = metrics.confusion_matrix(y_test, y_pred_test)
confusion
metrics.accuracy_score(y_test, y_pred_test)

# Class-wise result without PCA:
# precision - a sample labelled as car is correct 96% of the time, and
# similarly for the other classes.
# recall - an actual car is recognized as car 94% of the time, and similarly
# for the other classes.
# So the model does a good job, with an accuracy of 94%.
class_wise = metrics.classification_report(y_true=y_test, y_pred=y_pred_test)
print(class_wise)
# SVC trained on the 7-component PCA features.
svc_1.fit(X_train_pca, y_train)

# Predict, then evaluate with a confusion matrix and accuracy.
y_pred_test_pca = svc_1.predict(X_test_pca)
confusion_pca = metrics.confusion_matrix(y_test, y_pred_test_pca)
confusion_pca
metrics.accuracy_score(y_test, y_pred_test_pca)

# Class-wise result with PCA: performance has clearly decreased a lot.
# Maybe try increasing the PCA components from 7 to 9.
class_wise_pca = metrics.classification_report(y_test, y_pred_test_pca)
print(class_wise_pca)
# Repeat with 9 principal components.
pca_final1 = PCA(n_components=9)
pc9_cols = ['PC' + str(i) for i in range(1, 10)]
X_train_pca1 = pd.DataFrame(pca_final1.fit_transform(X_train_std),
                            columns=pc9_cols)
# Project the test data with the train-fitted PCA.
X_test_pca1 = pd.DataFrame(pca_final1.transform(X_test), columns=pc9_cols)

# Fit the SVC on the 9-component features and evaluate.
svc_1.fit(X_train_pca1, y_train)
y_pred_test_pca1 = svc_1.predict(X_test_pca1)
confusion_pca1 = metrics.confusion_matrix(y_test, y_pred_test_pca1)
confusion_pca1
metrics.accuracy_score(y_test, y_pred_test_pca1)
class_wise_pca1 = metrics.classification_report(y_test, y_pred_test_pca1)
print(class_wise_pca1)
## Clearly SVC without PCA gives better results, with an improvement in
## accuracy as well.
# Hyper-parameter tuning with GridSearchCV on the no-PCA SVC model.
# gamma is ignored by the linear kernel, so it is searched only for 'rbf';
# the list-of-dicts grid avoids fitting redundant (linear, gamma) combos.
params = [
    {"kernel": ['linear'], 'C': [0.5, 1, 10, 100]},
    {"kernel": ['rbf'], "gamma": [0.1, 1, 10, 100], 'C': [0.5, 1, 10, 100]},
]
from sklearn.model_selection import GridSearchCV
model_cv = GridSearchCV(estimator = svc_1, param_grid = params,
                        scoring= 'accuracy',
                        cv=3,
                        verbose = 1,
                        return_train_score=True)
model_cv.fit(X_train_std, y_train)
# results of grid search CV
cv_results = pd.DataFrame(model_cv.cv_results_)
#cv_results
# parameters' best values
best_score = model_cv.best_score_
best = model_cv.best_params_
best
# Use the refit best estimator directly rather than re-typing the winning
# parameters by hand: the hard-coded copy silently goes stale whenever the
# grid, the data, or the CV split changes.
model_best = model_cv.best_estimator_
# predict
y_pred_best = model_best.predict(X_test)
# confusion matrix
confusion_gcv = metrics.confusion_matrix(y_test, y_pred_best)
confusion_gcv
# measure accuracy
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred_best))
# class-wise result with the GridSearchCV-selected model
class_wise_gcv = metrics.classification_report(y_test, y_pred_best)
print(class_wise_gcv)
# The model improved with GridSearch. Now try GridSearchCV with the
# 9-component PCA features.
# As before, gamma applies only to the rbf kernel, so search it only there.
params = [
    {"kernel": ['linear'], 'C': [0.5, 1, 10, 100]},
    {"kernel": ['rbf'], "gamma": [0.1, 1, 10, 100], 'C': [0.5, 1, 10, 100]},
]
model_cv1 = GridSearchCV(estimator = svc_1, param_grid = params,
                         scoring= 'accuracy',
                         cv=3,
                         verbose = 1,
                         return_train_score=True)
model_cv1.fit(X_train_pca1, y_train)
# results of grid search CV
cv_results1 = pd.DataFrame(model_cv1.cv_results_)
#cv_results1
# parameters' best values
best_score1 = model_cv1.best_score_
best1 = model_cv1.best_params_
best1
# Use the refit best estimator rather than hand-copied parameters (the
# hard-coded values would go stale if the search result changed).
model_best1 = model_cv1.best_estimator_
# predict
y_pred_best1 = model_best1.predict(X_test_pca1)
# confusion matrix
confusion_gcv1 = metrics.confusion_matrix(y_test, y_pred_best1)
confusion_gcv1
# measure accuracy
metrics.accuracy_score(y_test, y_pred_best1)
# class-wise result
class_wise_gcv1 = metrics.classification_report(y_test, y_pred_best1)
print(class_wise_gcv1)
# GridSearchCV with 9 PCA components gives comparable results to GridSearchCV
# on the raw standardized features. It can be considered better since it uses
# fewer features, making the model less complex.
#The vehicle classes chosen are readily distinguishable: bus, van and car.
#But the data shows no separability between the 2 car models.
#GridSearchCV with SVC
#precision - a sample labelled as car is correct 96% of the time, and similarly for the other classes.
#recall - an actual car is recognized as car 95% of the time by the algorithm, and similarly for the other classes.
#So the model does a good job, with an accuracy of 95.2%
#Accuracy: 0.9529411764705882
# precision recall f1-score support
#
# bus 0.96 0.98 0.97 48
# car 0.96 0.95 0.96 80
# van 0.93 0.93 0.93 42
# micro avg 0.95 0.95 0.95 170
# macro avg 0.95 0.95 0.95 170
#weighted avg 0.95 0.95 0.95 170
#GridSearchCV with SVC and PCA-9
#precision - a sample labelled as car is correct 96% of the time, and similarly for the other classes.
#recall - an actual car is recognized as car 91% of the time by the algorithm, and similarly for the other classes.
#So the model does a good job, with an accuracy of 93.5%
#Accuracy: 0.9352941176470588
# precision recall f1-score support
#
# bus 0.94 0.98 0.96 48
# car 0.96 0.91 0.94 80
# van 0.89 0.93 0.91 42
# micro avg 0.94 0.94 0.94 170
# macro avg 0.93 0.94 0.93 170
#weighted avg 0.94 0.94 0.94 170